Map of Madrid with points and color indicating prizes

library(readxl)
data_train <- read_excel("Data/data_train.xlsx")
#install.packages(c("ggplot2", "sf", "dplyr"))

# Load libraries
library(ggplot2)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)

Exploratory Analysis

Category Variable Description
Continuous precio.house.m2 Property sale price in euros per square meter
antig Age of the property (years)
Ruidos_ext External noise (%)
Mal_olor Pollution or bad odors (%)
Poca_limp Lack of street cleanliness (%)
Malas_comunic Poor communications (%)
Pocas_zonas Few green spaces (%)
Delincuencia Crime (%)
CO Level of CO in the air at the property’s coordinates (standardized values)
NO2 Level of NO2 in the air at the property’s coordinates (standardized values)
Nox Level of Nox in the air at the property’s coordinates (standardized values)
O3 Level of O3 in the air at the property’s coordinates (standardized values)
SO2 Level of SO2 in the air at the property’s coordinates (standardized values)
PM10 Level of CO in the air at the property’s coordinates (standardized values)
Pobl.0_14_div_Poblac.Total Percentage of children between 0 and 14 years in the district
PoblJubilada_div_Poblac.Total Percentage of retired population in the district
Inmigrantes.porc Percentage of immigrant population in the district
sup.const Built area of the property
sup.util Usable area of the property
Categorical barrio/cod_barrio Code or name of the neighborhood in the city of Madrid
distrito/cod_distrito Code or name of the district in the city of Madrid
longitud/latitud Geographical coordinates of the property (longitude and latitude)
dorm Number of bedrooms
banos Number of bathrooms
tipo.casa Type of property
estado Condition of the property
Binary inter.exter Interior or exterior design of the property
ascensor Elevator availability in the building
comercial Indicates if the property is located in a commercial area
casco.historico Indicates if the property is in Madrid’s historic center
M.30 Indicates if the property is within the M-30
Identifier ref.hip.zona Mortgage reference of the area
train_indices ID of the observation
colnames(data_train)
##  [1] "train_indices"                 "barrio"                       
##  [3] "cod_barrio"                    "distrito"                     
##  [5] "cod_distrito"                  "longitud"                     
##  [7] "latitud"                       "precio.house.m2"              
##  [9] "sup.const"                     "sup.util"                     
## [11] "ref.hip.zona"                  "dorm"                         
## [13] "banos"                         "tipo.casa"                    
## [15] "inter.exter"                   "ascensor"                     
## [17] "estado"                        "antig"                        
## [19] "comercial"                     "casco.historico"              
## [21] "Ruidos_ext"                    "Mal_olor"                     
## [23] "Poca_limp"                     "Malas_comunic"                
## [25] "Pocas_zonas"                   "Delincuencia"                 
## [27] "M.30"                          "CO"                           
## [29] "NO2"                           "Nox"                          
## [31] "O3"                            "SO2"                          
## [33] "PM10"                          "Pobl.0_14_div_Poblac.Total"   
## [35] "PoblJubilada_div_Poblac.Total" "Inmigrantes.porc"
# turn identifiers into strings to prevent errors
data_train[, c("train_indices", "cod_barrio", "cod_distrito", "ref.hip.zona ", "casco.historico", "M.30","comercial")] <- lapply(data_train[, c("train_indices", "cod_barrio", "cod_distrito", "ref.hip.zona", "casco.historico", "M.30","comercial")], as.character)
unique_counts <- data.frame(
  Column = names(data_train),
  Unique_Values = sapply(data_train, function(x) length(unique(x)))
)
unique_counts
##                                                      Column Unique_Values
## train_indices                                 train_indices           736
## barrio                                               barrio           118
## cod_barrio                                       cod_barrio           116
## distrito                                           distrito            21
## cod_distrito                                   cod_distrito            21
## longitud                                           longitud           712
## latitud                                             latitud           720
## precio.house.m2                             precio.house.m2           567
## sup.const                                         sup.const           164
## sup.util                                           sup.util           160
## ref.hip.zona                                   ref.hip.zona           581
## dorm                                                   dorm             8
## banos                                                 banos             6
## tipo.casa                                         tipo.casa             6
## inter.exter                                     inter.exter             2
## ascensor                                           ascensor             2
## estado                                               estado             7
## antig                                                 antig           174
## comercial                                         comercial             2
## casco.historico                             casco.historico             2
## Ruidos_ext                                       Ruidos_ext            38
## Mal_olor                                           Mal_olor            43
## Poca_limp                                         Poca_limp            45
## Malas_comunic                                 Malas_comunic            48
## Pocas_zonas                                     Pocas_zonas            57
## Delincuencia                                   Delincuencia            55
## M.30                                                   M.30             2
## CO                                                       CO           691
## NO2                                                     NO2           412
## Nox                                                     Nox           412
## O3                                                       O3           625
## SO2                                                     SO2           736
## PM10                                                   PM10           736
## Pobl.0_14_div_Poblac.Total       Pobl.0_14_div_Poblac.Total            19
## PoblJubilada_div_Poblac.Total PoblJubilada_div_Poblac.Total            19
## Inmigrantes.porc                           Inmigrantes.porc            19
## ref.hip.zona                                  ref.hip.zona            581

Frequency Analysis

plot_hist = function(data_train, type = "both", n, m) {

  # Configure subplots
  par(mfrow = c(n, m),mar = c(10, 4, 4, 1))
  
  # Loop through columns
  for (col in colnames(data_train)) {
    # plot numeric columns
    if (is.numeric(data_train[[col]]) && (type == "numeric" || type == "both")) {
      
      # Create histograms
      hist(data_train[[col]], 
           main = paste(col), 
           xlab = "",
           col = "dodgerblue4")
      
      # plot non-numeric columns
    } else if (!is.numeric(data_train[[col]]) && (type == "categoric" 
                                            || type == "both")) {
      # Adjust x-axis label arrangement for specific cols
      barplot(table(data_train[[col]]), 
              main = paste(col), 
              xlab = "", 
              col = "dodgerblue",
              las = 2)
    }
  }
  
  # Reset the plotting layout
  par(mfrow = c(1, 1))
}
# Numeric hist
plot_hist(data = data_train ,type="numeric", n=3, m=2)

# Numeric hist
plot_hist(data = data_train ,type="categoric", n=3, m=2)

TODO: Conditional histograms for categoricals (barrios)

# Install and load necessary packages
library(reshape2)
library(ggplot2)
library(GGally) # This is required for ggpairs

# Compute correlation matrix
numeric_column_names <- names(data_train)[sapply(data_train, is.numeric)]
corr_mat <- round(cor(data_train[, numeric_column_names]), 2)
melted_corr_mat <- melt(corr_mat)

# Plot correlation heatmap
ggplot(data = melted_corr_mat, aes(x = Var2, y = Var1, fill = value)) + 
  geom_tile() +
  geom_text(aes(label = value), color = "black", size = 4)+
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal()

# Pairplot using ggpairs
#ggpairs(data_train[, numeric_column_names], 
#        aes(color = data_train$group_var)) 

Multivariate

average_price_per_barrio <- data_train %>%
  group_by(barrio) %>%
  summarise(average_price = mean(`precio.house.m2`, na.rm = TRUE))

ggplot(average_price_per_barrio, aes(x = reorder(barrio, average_price), y = average_price)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better readability if there are many barrios
  labs(title = "Average Price per 2m by Barrio",
       x = "Barrio",
       y = "Average Price per 2m") +
  theme_minimal()

average_price_per_district <- data_train %>%
  group_by(distrito) %>%
  summarise(average_price = mean(`precio.house.m2`, na.rm = TRUE))

ggplot(average_price_per_district, aes(x = reorder(distrito, average_price), y = average_price)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better readability if there are many barrios
  labs(title = "Average Price per 2m by District",
       x = "District",
       y = "Average Price per 2m") +
  theme_minimal()

points_sf <- st_as_sf(data_train, coords = c("longitud", "latitud"), crs = 4326)
points_sf <- points_sf[order(points_sf$precio.house.m2), ]
# Download the map of Madrid (use a simple shapefile, for example)
madrid_map <- st_read("Data/madrid.geojson")
## Reading layer `madrid' from data source 
##   `/Users/Simon 1/Documents/GitHub/uc3m_regression_competition/Data/madrid.geojson' 
##   using driver `GeoJSON'
## Simple feature collection with 128 features and 15 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -3.887647 ymin: 40.31325 xmax: -3.516929 ymax: 40.64444
## Geodetic CRS:  WGS 84
# Plot the map with the points colored by the 'value' variable
ggplot() +
  geom_sf(data = madrid_map, fill = "lightgray", color = "white") +  # Plot map
  geom_sf(data = points_sf, aes(color = precio.house.m2), size = 1.5) +  # Plot points
  scale_color_gradient(low = "#00C5CD", high = "red") +
  labs(title = "Spatial situation and price",
       x = "Longitude", y = "Latitude") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

### Done with teh help of ChatGPT